#pragma once

#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <mutex>
#include "util.hpp"
#include "log.hpp"

namespace ns_index
{
    // One parsed document as stored in the forward index.
    struct DocInfo
    {
        std::string title;   // document title
        std::string content; // document content
        std::string url;     // URL of the official documentation page
        uint64_t doc_id;     // document ID
    };

    // One entry of an inverted list: ties a keyword to a document that
    // contains it, together with a relevance weight.
    struct InvertedElem
    {
        uint64_t doc_id;   // ID of the document that contains the keyword
        std::string word;  // the keyword itself
        int weight;        // relevance weight of the keyword within that document

        // Initialize every scalar member so a default-constructed element
        // never carries an indeterminate doc_id.
        InvertedElem():doc_id(0), weight(0){}
    };

    // Inverted list (posting list): all InvertedElem entries for one keyword.
    using InvertedElemList = std::vector<InvertedElem>;

    // Singleton that owns both search indexes:
    //   - the forward index, mapping a document ID to its DocInfo;
    //   - the inverted index, mapping a keyword to its inverted list.
    // Obtain the object through GetInstance(); copying is disabled.
    // Only GetInstance() takes the mutex; the other member functions perform
    // no locking of their own.
    class Index
    {
    private:
        // Forward index: a vector whose subscript is the document ID
        // (BuildForwardIndex assigns doc_id = current size before appending).
        std::vector<DocInfo>  forward_index;
        // Inverted index: each keyword maps to its inverted list, i.e. the
        // InvertedElem entries of the documents that contain the keyword.
        std::unordered_map<std::string, InvertedElemList> inverted_index;
    private:
        Index(){}
        Index(const Index&) = delete;
        Index& operator=(const Index&) = delete;

        static Index* instance; // the single instance, created on first GetInstance()
        static std::mutex mtx;  // guards creation of `instance`
    public:
        // Returns the process-wide Index, creating it on the first call.
        // The instance is allocated with new and is not deleted by this class.
        //
        // The mutex is taken on every call: checking `instance` before locking
        // would read a plain pointer that another thread may be writing at the
        // same time, which is a data race. std::lock_guard also guarantees the
        // mutex is released if `new` throws.
        static Index* GetInstance()
        {
            std::lock_guard<std::mutex> guard(mtx);
            if(nullptr == instance)
            {
                instance = new Index();
            }

            return instance;
        }
        ~Index()
        {}
    public:
        // Looks up a document by its ID.
        // Returns a pointer into the forward index, or nullptr (after printing
        // an error to std::cerr) when doc_id is out of range.
        DocInfo *GetForwardIndex(uint64_t doc_id)
        {
            if(doc_id >= forward_index.size())
            {
                std::cerr << "doc_id out range,error" << std::endl;
                return nullptr;
            }

            return &forward_index[doc_id];
        }

        // Looks up the inverted list for a keyword.
        // The lookup is an exact match on `word`; note that BuildInvertedIndex
        // stores keywords lower-cased.
        // Returns a pointer to the list, or nullptr (after printing an error to
        // std::cerr) when the keyword is not present.
        InvertedElemList *GetInvertedElemList(const std::string &word)
        {
            auto it = inverted_index.find(word);
            if(it == inverted_index.end())
            {
                std::cerr << word << " no exist" << std::endl;
                return nullptr;
            }

            return &(it->second);
        }

        // Builds the forward and inverted indexes from the cleaned
        // (tag-stripped, formatted) document file, one document per line,
        // e.g. data/raw_html/raw.txt.
        // Lines that cannot be parsed are reported on std::cerr and skipped.
        // Returns false only when the input file cannot be opened.
        bool BuildIndex(const std::string &input)
        {
            std::fstream in(input, std::ios::in | std::ios::binary);
            if(!in.is_open())
            {
                std::cerr << input << " no exist" << std::endl;
                return false;
            }

            std::string line;
            int cnt = 0; // number of documents indexed so far
            while(std::getline(in, line))
            {
                DocInfo *doc = BuildForwardIndex(line);
                if(nullptr == doc)
                {
                    std::cerr << "build forward index error" << std::endl;
                    continue;
                }

                BuildInvertedIndex(*doc);
                cnt++;
                // Report progress every 50 documents.
                if(cnt % 50 == 0)
                {
                    LOG(NORMAL, "当前已经建立的索引文档： " + std::to_string(cnt));
                }
            }

            return true;
        }

    private:
        // Parses one input line into a DocInfo and appends it to the forward index.
        // The line is split on "\3" and must yield exactly three fields:
        // title, content and url, in that order.
        // Returns a pointer to the stored DocInfo, or nullptr when the line does
        // not have exactly three fields. The pointer refers to an element of
        // forward_index, so it is only safe to use until the next insertion.
        DocInfo *BuildForwardIndex(const std::string &line)
        {
            // 1. Split the line into its fields.
            std::vector<std::string> results;
            const std::string sep = "\3";
            ns_util::StringUtil::Split(line, &results, sep);
            if(results.size() != 3)
            {
                return nullptr;
            }

            // 2. Fill in the DocInfo. The fields are moved rather than copied
            //    because `results` is not used again and content can be large.
            DocInfo doc;
            doc.title = std::move(results[0]);
            doc.content = std::move(results[1]);
            doc.url = std::move(results[2]);
            doc.doc_id = forward_index.size(); // ID == position in the vector

            // 3. Append to the forward index (moved to avoid a second copy).
            forward_index.push_back(std::move(doc));
            return &forward_index.back();
        }

        // Adds one document to the inverted index.
        // The title and the content are tokenized with
        // ns_util::JiebaUtil::CutString, every token is lower-cased, and for
        // each distinct token an InvertedElem is appended to that token's list
        // with weight = 10 * (occurrences in title) + 1 * (occurrences in content).
        // Always returns true.
        bool BuildInvertedIndex(const DocInfo &doc)
        {
            // Per-word occurrence counters for this document.
            struct word_cnt
            {
                int title_cnt;
                int content_cnt;

                word_cnt():title_cnt(0), content_cnt(0){}
            };
            std::unordered_map<std::string, word_cnt> word_map;  // temporary word-frequency table

            // Tokenize the title and count occurrences.
            std::vector<std::string> title_words;
            ns_util::JiebaUtil::CutString(doc.title, &title_words);

            for(auto &s : title_words)
            {
                boost::to_lower(s); // normalize to lower case
                word_map[s].title_cnt++;
            }

            // Tokenize the content and count occurrences.
            std::vector<std::string> content_words;
            ns_util::JiebaUtil::CutString(doc.content, &content_words);

            for(auto &s : content_words)
            {
                boost::to_lower(s); // normalize to lower case
                word_map[s].content_cnt++;
            }

            // Relevance factors. These are local constants rather than macros so
            // that including this header does not define one-letter macro names
            // in every translation unit.
            constexpr int kTitleWeight = 10;
            constexpr int kContentWeight = 1;

            for(auto &word_pair : word_map)
            {
                InvertedElem item;
                item.doc_id = doc.doc_id;
                item.word = word_pair.first;
                item.weight = kTitleWeight*word_pair.second.title_cnt
                            + kContentWeight*word_pair.second.content_cnt;  // relevance
                InvertedElemList &inverted_list = inverted_index[word_pair.first];
                inverted_list.push_back(std::move(item));
            }

            return true;
        }
    };
    // Out-of-class definitions of Index's static data members.
    // NOTE(review): these are non-inline definitions placed in a header; if this
    // header is included from more than one translation unit they would be
    // defined more than once — confirm it is only included from a single .cpp.
    std::mutex Index::mtx;
    Index *Index::instance = nullptr;
}
